# This file contains an example of tuning a Logistic Regression model with BayesSearchCV.
import pickle
import time
import helpsk as hlp
import pandas as pd
import numpy as np
import plotly.express as px
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder
import plotly.io as pio
# Render plotly figures inline in notebook output.
pio.renderers.default='notebook'
# Load the pre-split training data (pickled by an earlier step).
# The notebook export dropped the indentation of the `with` bodies, which is a
# SyntaxError in a script; restored here.
# NOTE(review): pickle.load is only safe on trusted, locally-created files.
with open('../X_train.pkl', 'rb') as handle:
    X_train = pickle.load(handle)
with open('../y_train.pkl', 'rb') as handle:
    y_train = pickle.load(handle)
# Summary statistics (nulls, zeros, distribution) for the numeric columns.
hlp.pandas.numeric_summary(X_train)
| # of Non-Nulls | # of Nulls | % Nulls | # of Zeros | % Zeros | Mean | St Dev. | Coef of Var | Skewness | Kurtosis | Min | 10% | 25% | 50% | 75% | 90% | Max | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| duration | 760 | 40 | 5.0% | 0 | 0.0% | 21.0 | 11.7 | 0.6 | 1.0 | 0.6 | 4.0 | 9.0 | 12.0 | 18.0 | 24.0 | 36.0 | 60.0 |
| credit_amount | 800 | 0 | 0.0% | 38 | 5.0% | 3,203.9 | 2,932.3 | 0.9 | 1.9 | 3.9 | 0.0 | 753.9 | 1,300.8 | 2,236.5 | 3,951.5 | 7,394.6 | 18,424.0 |
| installment_commitment | 800 | 0 | 0.0% | 0 | 0.0% | 3.0 | 1.1 | 0.4 | -0.5 | -1.2 | 1.0 | 1.0 | 2.0 | 3.0 | 4.0 | 4.0 | 4.0 |
| residence_since | 800 | 0 | 0.0% | 0 | 0.0% | 2.9 | 1.1 | 0.4 | -0.3 | -1.4 | 1.0 | 1.0 | 2.0 | 3.0 | 4.0 | 4.0 | 4.0 |
| age | 800 | 0 | 0.0% | 0 | 0.0% | 35.6 | 11.4 | 0.3 | 1.0 | 0.7 | 19.0 | 23.0 | 27.0 | 33.0 | 42.0 | 52.0 | 75.0 |
| existing_credits | 800 | 0 | 0.0% | 0 | 0.0% | 1.4 | 0.6 | 0.4 | 1.3 | 1.6 | 1.0 | 1.0 | 1.0 | 1.0 | 2.0 | 2.0 | 4.0 |
| num_dependents | 800 | 0 | 0.0% | 0 | 0.0% | 1.1 | 0.3 | 0.3 | 2.0 | 2.1 | 1.0 | 1.0 | 1.0 | 1.0 | 1.0 | 2.0 | 2.0 |
# Summary of the non-numeric (categorical) columns: nulls, mode, cardinality.
hlp.pandas.non_numeric_summary(X_train)
| # of Non-Nulls | # of Nulls | % Nulls | Most Freq. Value | # of Unique | % Unique | |
|---|---|---|---|---|---|---|
| checking_status | 763 | 37 | 4.6% | no checking | 4 | 0.5% |
| credit_history | 800 | 0 | 0.0% | existing paid | 5 | 0.6% |
| purpose | 800 | 0 | 0.0% | radio/tv | 10 | 1.2% |
| savings_status | 800 | 0 | 0.0% | <100 | 5 | 0.6% |
| employment | 800 | 0 | 0.0% | 1<=X<4 | 5 | 0.6% |
| personal_status | 800 | 0 | 0.0% | male single | 4 | 0.5% |
| other_parties | 800 | 0 | 0.0% | none | 3 | 0.4% |
| property_magnitude | 800 | 0 | 0.0% | car | 4 | 0.5% |
| other_payment_plans | 800 | 0 | 0.0% | none | 3 | 0.4% |
| housing | 800 | 0 | 0.0% | own | 3 | 0.4% |
| job | 800 | 0 | 0.0% | skilled | 4 | 0.5% |
| own_telephone | 800 | 0 | 0.0% | none | 2 | 0.2% |
| foreign_worker | 800 | 0 | 0.0% | yes | 2 | 0.2% |
# Peek at the first ten target labels.
y_train[0:10]
array([1, 1, 0, 1, 0, 1, 0, 1, 1, 0])
# Counts of each target class (0 vs 1).
np.unique(y_train, return_counts=True)
(array([0, 1]), array([559, 241]))
# Class balance as proportions: compute the counts once instead of calling
# np.unique(..., return_counts=True) twice on the same data.
_, class_counts = np.unique(y_train, return_counts=True)
class_counts / np.sum(class_counts)
array([0.69875, 0.30125])
# Pre-built Bayesian search space (pipeline + hyper-parameter distributions)
# for logistic regression, provided by the helpsk package.
search_space = hlp.sklearn_search.LogisticBayesianSearchSpace(random_state=42)
# pip install scikit-optimize
from skopt import BayesSearchCV
#from skopt.space import Real, Categorical, Integer
from sklearn.model_selection import RepeatedKFold
# Bayesian hyper-parameter search scored by ROC-AUC over 5-fold CV repeated
# twice (10 fits per candidate), using all available cores.
bayes_search = BayesSearchCV(
estimator=search_space.pipeline(data=X_train),
search_spaces=search_space.search_spaces(),
cv=RepeatedKFold(n_splits=5, n_repeats=2, random_state=42),
scoring='roc_auc',
n_jobs=-1,
verbose=1,
random_state=42,
)
# Time the full search for reporting below.
start_time = time.time()
bayes_search.fit(X_train, y_train)
elapsed_time = time.time() - start_time
print(f"Elapsed time to run BayesSearchCV: {elapsed_time:.3f} seconds; {elapsed_time / 60:.1f} minutes")
Elapsed time to run BayesSearchCV: 60.994 seconds; 1.0 minutes
# Mean cross-validated ROC-AUC of the best candidate found by the search.
print(bayes_search.best_score_)
0.7700573739011715
# Winning hyper-parameter combination (model C, imputer, scaler, encoder).
print(bayes_search.best_params_)
OrderedDict([('model', LogisticRegression(C=0.06672457466363003, max_iter=1000, random_state=42)), ('model__C', 0.06672457466363003), ('prep__non_numeric__encoder__transformer', OneHotEncoder(handle_unknown='ignore')), ('prep__numeric__imputer__transformer', SimpleImputer(strategy='most_frequent')), ('prep__numeric__scaler__transformer', StandardScaler())])
# Store the yaml file name once so the write and the read-back stay in sync.
yaml_path = 'Run 1 - Logistic Regression - BayesSearchCV.yaml'
# Wrap the fitted searcher in a serializable results object, mapping the long
# sklearn parameter paths to short names (C, imputer, scaler, encoder).
results = hlp.sklearn_eval.MLExperimentResults.from_sklearn_search_cv(
    searcher=bayes_search,
    higher_score_is_better=True,
    parameter_name_mappings=search_space.param_name_mappings(),
)
# Round-trip through yaml to verify persistence works.
results.to_yaml_file(yaml_file_name=yaml_path)
results = hlp.sklearn_eval.MLExperimentResults.from_yaml_file(yaml_file_name=yaml_path)
# Best score from the round-tripped experiment results (should match above).
results.best_score
0.7700573739011715
# Best hyper-parameters, using the short mapped names.
results.best_params
{'model': 'LogisticRegression()',
'C': 0.06672457466363003,
'imputer': "SimpleImputer(strategy='most_frequent')",
'scaler': 'StandardScaler()',
'encoder': 'OneHotEncoder()'}
# Leaderboard of all trials: mean score with 95% confidence intervals plus the
# hyper-parameter values for each trial.
results.to_formatted_dataframe(num_rows=100, include_rank=True)
| rank | roc_auc Mean | roc_auc 95CI.LO | roc_auc 95CI.HI | C | imputer | scaler | encoder |
|---|---|---|---|---|---|---|---|
| 1 | 0.770 | 0.750 | 0.790 | 0.067 | SimpleImputer(strategy='most_frequent') | StandardScaler() | OneHotEncoder() |
| 2 | 0.770 | 0.750 | 0.790 | 0.067 | SimpleImputer(strategy='most_frequent') | StandardScaler() | OneHotEncoder() |
| 3 | 0.770 | 0.750 | 0.790 | 0.067 | SimpleImputer(strategy='most_frequent') | StandardScaler() | OneHotEncoder() |
| 4 | 0.770 | 0.751 | 0.789 | 0.074 | SimpleImputer() | StandardScaler() | OneHotEncoder() |
| 5 | 0.770 | 0.751 | 0.789 | 0.074 | SimpleImputer() | StandardScaler() | OneHotEncoder() |
| 6 | 0.770 | 0.751 | 0.789 | 0.074 | SimpleImputer() | StandardScaler() | OneHotEncoder() |
| 7 | 0.770 | 0.750 | 0.789 | 0.073 | SimpleImputer() | StandardScaler() | OneHotEncoder() |
| 8 | 0.770 | 0.750 | 0.789 | 0.073 | SimpleImputer() | StandardScaler() | OneHotEncoder() |
| 9 | 0.770 | 0.750 | 0.789 | 0.065 | SimpleImputer() | StandardScaler() | OneHotEncoder() |
| 10 | 0.770 | 0.750 | 0.789 | 0.086 | SimpleImputer() | StandardScaler() | OneHotEncoder() |
| 11 | 0.768 | 0.748 | 0.788 | 0.127 | SimpleImputer() | StandardScaler() | OneHotEncoder() |
| 12 | 0.765 | 0.735 | 0.795 | 0.087 | SimpleImputer(strategy='most_frequent') | MinMaxScaler() | OneHotEncoder() |
| 13 | 0.765 | 0.735 | 0.795 | 0.097 | SimpleImputer(strategy='most_frequent') | MinMaxScaler() | OneHotEncoder() |
| 14 | 0.765 | 0.735 | 0.795 | 0.085 | SimpleImputer(strategy='most_frequent') | MinMaxScaler() | OneHotEncoder() |
| 15 | 0.764 | 0.735 | 0.794 | 0.132 | SimpleImputer(strategy='most_frequent') | MinMaxScaler() | OneHotEncoder() |
| 16 | 0.764 | 0.736 | 0.793 | 0.152 | SimpleImputer() | MinMaxScaler() | OneHotEncoder() |
| 17 | 0.764 | 0.735 | 0.793 | 0.143 | SimpleImputer(strategy='most_frequent') | MinMaxScaler() | OneHotEncoder() |
| 18 | 0.762 | 0.740 | 0.784 | 0.011 | SimpleImputer(strategy='most_frequent') | StandardScaler() | OneHotEncoder() |
| 19 | 0.762 | 0.732 | 0.792 | 0.024 | SimpleImputer() | MinMaxScaler() | OneHotEncoder() |
| 20 | 0.760 | 0.740 | 0.780 | 0.710 | SimpleImputer(strategy='most_frequent') | StandardScaler() | OneHotEncoder() |
| 21 | 0.757 | 0.738 | 0.777 | <NA> | SimpleImputer() | StandardScaler() | OneHotEncoder() |
| 22 | 0.757 | 0.728 | 0.786 | 0.004 | SimpleImputer(strategy='most_frequent') | MinMaxScaler() | OneHotEncoder() |
| 23 | 0.756 | 0.735 | 0.778 | 1.596 | SimpleImputer(strategy='most_frequent') | MinMaxScaler() | OneHotEncoder() |
| 24 | 0.756 | 0.727 | 0.784 | 0.001 | SimpleImputer() | MinMaxScaler() | OneHotEncoder() |
| 25 | 0.755 | 0.727 | 0.784 | 0.000 | SimpleImputer(strategy='most_frequent') | MinMaxScaler() | OneHotEncoder() |
| 26 | 0.755 | 0.727 | 0.784 | 0.000 | SimpleImputer(strategy='most_frequent') | MinMaxScaler() | OneHotEncoder() |
| 27 | 0.755 | 0.727 | 0.784 | 0.000 | SimpleImputer(strategy='most_frequent') | MinMaxScaler() | OneHotEncoder() |
| 28 | 0.755 | 0.727 | 0.784 | 0.000 | SimpleImputer() | MinMaxScaler() | OneHotEncoder() |
| 29 | 0.751 | 0.733 | 0.768 | 3.509 | SimpleImputer() | StandardScaler() | OneHotEncoder() |
| 30 | 0.747 | 0.730 | 0.764 | 11.655 | SimpleImputer(strategy='most_frequent') | MinMaxScaler() | OneHotEncoder() |
| 31 | 0.745 | 0.728 | 0.762 | 22.913 | SimpleImputer(strategy='most_frequent') | StandardScaler() | OneHotEncoder() |
| 32 | 0.745 | 0.728 | 0.762 | 22.376 | SimpleImputer(strategy='median') | StandardScaler() | OneHotEncoder() |
| 33 | 0.744 | 0.727 | 0.761 | 32.731 | SimpleImputer() | MinMaxScaler() | OneHotEncoder() |
| 34 | 0.742 | 0.725 | 0.760 | 99.350 | SimpleImputer(strategy='most_frequent') | MinMaxScaler() | OneHotEncoder() |
| 35 | 0.742 | 0.725 | 0.759 | 99.282 | SimpleImputer() | StandardScaler() | OneHotEncoder() |
| 36 | 0.734 | 0.704 | 0.764 | 0.001 | SimpleImputer(strategy='median') | StandardScaler() | OneHotEncoder() |
| 37 | 0.733 | 0.714 | 0.752 | 0.019 | SimpleImputer(strategy='most_frequent') | StandardScaler() | CustomOrdinalEncoder() |
| 38 | 0.729 | 0.698 | 0.760 | 0.000 | SimpleImputer(strategy='most_frequent') | StandardScaler() | OneHotEncoder() |
| 39 | 0.728 | 0.708 | 0.749 | 0.001 | SimpleImputer() | StandardScaler() | CustomOrdinalEncoder() |
| 40 | 0.728 | 0.697 | 0.759 | 0.000 | SimpleImputer() | StandardScaler() | OneHotEncoder() |
| 41 | 0.726 | 0.708 | 0.744 | 0.403 | SimpleImputer(strategy='median') | StandardScaler() | CustomOrdinalEncoder() |
| 42 | 0.725 | 0.708 | 0.743 | 3.489 | SimpleImputer(strategy='median') | MinMaxScaler() | CustomOrdinalEncoder() |
| 43 | 0.725 | 0.707 | 0.744 | 7.766 | SimpleImputer(strategy='most_frequent') | StandardScaler() | CustomOrdinalEncoder() |
| 44 | 0.725 | 0.707 | 0.743 | 99.377 | SimpleImputer(strategy='median') | StandardScaler() | CustomOrdinalEncoder() |
| 45 | 0.725 | 0.707 | 0.743 | 100.000 | SimpleImputer() | MinMaxScaler() | CustomOrdinalEncoder() |
| 46 | 0.722 | 0.700 | 0.745 | 0.000 | SimpleImputer(strategy='most_frequent') | StandardScaler() | CustomOrdinalEncoder() |
| 47 | 0.722 | 0.699 | 0.744 | 0.000 | SimpleImputer() | StandardScaler() | CustomOrdinalEncoder() |
| 48 | 0.720 | 0.699 | 0.741 | 0.131 | SimpleImputer() | MinMaxScaler() | CustomOrdinalEncoder() |
| 49 | 0.709 | 0.686 | 0.731 | 0.003 | SimpleImputer(strategy='median') | MinMaxScaler() | CustomOrdinalEncoder() |
| 50 | 0.699 | 0.675 | 0.724 | 0.000 | SimpleImputer(strategy='most_frequent') | MinMaxScaler() | CustomOrdinalEncoder() |
| 51 | 0.699 | 0.675 | 0.724 | 0.000 | SimpleImputer() | MinMaxScaler() | CustomOrdinalEncoder() |
# Interactive diagnostics of the search trials.
results.plot_performance_across_trials().show()
results.plot_performance_across_trials(size=None, color='C').show()
results.plot_performance_across_trials(size='C', color='scaler').show()
results.plot_parameter_values_across_trials().show()
results.plot_scatter_matrix(height=800, width=800 * hlp.plot.GOLDEN_RATIO).show()
results.plot_performance_numeric_params()
results.plot_parallel_coordinates().show()
results.plot_performance_non_numeric_params()
# Score as a function of the regularization parameter C, colored by scaler
# and then by encoder.
results.plot_score_vs_parameter(
parameter='C',
color='scaler'
)
results.plot_score_vs_parameter(
parameter='C',
color='encoder'
)
# Build a dataframe of the mean-score column plus the hyper-parameter columns
# for regression analysis. (The notebook export fused the markdown heading
# "roc_auc Mean" onto this line, which was a SyntaxError; removed.)
score_variable = results.primary_score_name + ' Mean'
score_dataframe = results.to_dataframe()
score_dataframe = score_dataframe.drop(columns=[x for x in score_dataframe.columns
                                                if x not in [score_variable] + results.parameter_names])
score_dataframe.head()
| roc_auc Mean | C | imputer | scaler | encoder | |
|---|---|---|---|---|---|
| 47 | 0.770057 | 0.066725 | SimpleImputer(strategy='most_frequent') | StandardScaler() | OneHotEncoder() |
| 49 | 0.770057 | 0.067141 | SimpleImputer(strategy='most_frequent') | StandardScaler() | OneHotEncoder() |
| 44 | 0.770056 | 0.066860 | SimpleImputer(strategy='most_frequent') | StandardScaler() | OneHotEncoder() |
| 34 | 0.769830 | 0.074034 | SimpleImputer() | StandardScaler() | OneHotEncoder() |
| 33 | 0.769725 | 0.073866 | SimpleImputer() | StandardScaler() | OneHotEncoder() |
# Map each column name to a version that is safe for a patsy formula:
# spaces become underscores, every other non-alphanumeric character is dropped.
def _clean_name(name):
    return ''.join(ch for ch in name.replace(' ', '_') if ch == '_' or ch.isalnum())

cleaned_column_names = {col: _clean_name(col) for col in score_dataframe.columns.tolist()}
cleaned_column_names
{'roc_auc Mean': 'roc_auc_Mean',
'C': 'C',
'imputer': 'imputer',
'scaler': 'scaler',
'encoder': 'encoder'}
score_dataframe = score_dataframe.rename(columns=cleaned_column_names)
import statsmodels.formula.api as smf
# Regress the mean CV score on every hyper-parameter, e.g.
# "roc_auc_Mean ~ C + imputer + scaler + encoder".
y_column = 'roc_auc_Mean'
X_columns = score_dataframe.columns.tolist()
X_columns.remove(y_column)
X_columns = hlp.string.collapse(X_columns, separate=" + ", surround="")
formula = f"{y_column} ~ {X_columns}"
print(formula)
model = smf.ols(formula=formula, data=score_dataframe)
# Distinct name so this fit does not shadow the MLExperimentResults object
# that is still bound to `results`.
ols_results = model.fit()
print(ols_results.summary())
roc_auc_Mean ~ C + imputer + scaler + encoder
OLS Regression Results
==============================================================================
Dep. Variable: roc_auc_Mean R-squared: 0.706
Model: OLS Adj. R-squared: 0.673
Method: Least Squares F-statistic: 21.18
Date: Tue, 15 Feb 2022 Prob (F-statistic): 1.00e-10
Time: 10:00:34 Log-Likelihood: 154.69
No. Observations: 50 AIC: -297.4
Df Residuals: 44 BIC: -285.9
Df Model: 5
Covariance Type: nonrobust
======================================================================================================================
coef std err t P>|t| [0.025 0.975]
----------------------------------------------------------------------------------------------------------------------
Intercept 0.7213 0.005 158.139 0.000 0.712 0.731
imputer[T.SimpleImputer(strategy='median')] -0.0070 0.006 -1.218 0.230 -0.018 0.005
imputer[T.SimpleImputer(strategy='most_frequent')] -0.0007 0.004 -0.199 0.843 -0.008 0.007
scaler[T.StandardScaler()] 0.0042 0.003 1.225 0.227 -0.003 0.011
encoder[T.OneHotEncoder()] 0.0356 0.004 8.804 0.000 0.027 0.044
C -8.045e-05 6.22e-05 -1.294 0.202 -0.000 4.48e-05
==============================================================================
Omnibus: 14.241 Durbin-Watson: 1.251
Prob(Omnibus): 0.001 Jarque-Bera (JB): 15.353
Skew: -1.249 Prob(JB): 0.000464
Kurtosis: 4.062 Cond. No. 120.
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
import pandas as pd
# Identify which columns of the score dataframe are numeric vs non-numeric so
# only the numeric ones get standardized. (An unused `scaler = StandardScaler()`
# local and a commented-out call were removed.)
numeric_columns = hlp.pandas.get_numeric_columns(score_dataframe)
non_numeric_columns = hlp.pandas.get_non_numeric_columns(score_dataframe)
print(numeric_columns)
print(non_numeric_columns)
numeric_pipeline = Pipeline([
    ('scaling', StandardScaler()),
])
transformations_pipeline = ColumnTransformer([
    ('numeric_pipeline', numeric_pipeline, numeric_columns),
    ('non_numeric_pipeline', 'passthrough', non_numeric_columns),
])
# NOTE: ColumnTransformer returns a mixed-dtype (object) array; the numeric
# columns are converted back to float in the next cell.
score_dataframe_transformed = transformations_pipeline.fit_transform(score_dataframe)
score_dataframe_transformed = pd.DataFrame(score_dataframe_transformed,
                                           columns=numeric_columns + non_numeric_columns)
score_dataframe_transformed.head()
['roc_auc_Mean', 'C'] ['imputer', 'scaler', 'encoder']
| roc_auc_Mean | C | imputer | scaler | encoder | |
|---|---|---|---|---|---|
| 0 | 1.093033 | -0.371301 | SimpleImputer(strategy='most_frequent') | StandardScaler() | OneHotEncoder() |
| 1 | 1.093009 | -0.371286 | SimpleImputer(strategy='most_frequent') | StandardScaler() | OneHotEncoder() |
| 2 | 1.092983 | -0.371296 | SimpleImputer(strategy='most_frequent') | StandardScaler() | OneHotEncoder() |
| 3 | 1.081691 | -0.371032 | SimpleImputer() | StandardScaler() | OneHotEncoder() |
| 4 | 1.07647 | -0.371038 | SimpleImputer() | StandardScaler() | OneHotEncoder() |
# The ColumnTransformer output is an object array, so restore float dtypes for
# both numeric columns in a single astype() call.
score_dataframe_transformed = score_dataframe_transformed.astype({'roc_auc_Mean': 'float', 'C': 'float'})
print(formula)
# Re-fit the same OLS formula on the standardized data so coefficient
# magnitudes are comparable across hyper-parameters.
model = smf.ols(formula=formula,
                data=score_dataframe_transformed)
# NOTE(review): this rebinds `results` (previously the MLExperimentResults
# object); the coefficient analysis below relies on this rebinding.
results = model.fit()
print(results.summary())
roc_auc_Mean ~ C + imputer + scaler + encoder
OLS Regression Results
==============================================================================
Dep. Variable: roc_auc_Mean R-squared: 0.706
Model: OLS Adj. R-squared: 0.673
Method: Least Squares F-statistic: 21.18
Date: Tue, 15 Feb 2022 Prob (F-statistic): 1.00e-10
Time: 10:00:34 Log-Likelihood: -40.693
No. Observations: 50 AIC: 93.39
Df Residuals: 44 BIC: 104.9
Df Model: 5
Covariance Type: nonrobust
======================================================================================================================
coef std err t P>|t| [0.025 0.975]
----------------------------------------------------------------------------------------------------------------------
Intercept -1.3737 0.222 -6.191 0.000 -1.821 -0.927
imputer[T.SimpleImputer(strategy='median')] -0.3461 0.284 -1.218 0.230 -0.918 0.226
imputer[T.SimpleImputer(strategy='most_frequent')] -0.0358 0.180 -0.199 0.843 -0.398 0.326
scaler[T.StandardScaler()] 0.2069 0.169 1.225 0.227 -0.133 0.547
encoder[T.OneHotEncoder()] 1.7713 0.201 8.804 0.000 1.366 2.177
C -0.1086 0.084 -1.294 0.202 -0.278 0.061
==============================================================================
Omnibus: 14.241 Durbin-Watson: 1.251
Prob(Omnibus): 0.001 Jarque-Bera (JB): 15.353
Skew: -1.249 Prob(JB): 0.000464
Kurtosis: 4.062 Cond. No. 6.09
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
# Collect the OLS coefficients and p-values into a dataframe, dropping the
# intercept and flagging significance at the 0.05 level.
coefficients = pd.DataFrame({
    'feature': results.params.index,
    'coefficient': results.params,
    'p_value': results.pvalues,
})
# .copy() so adding the column below doesn't trigger pandas'
# SettingWithCopyWarning (query() may return a view-like copy).
coefficients = coefficients.query("feature != 'Intercept'").copy()
coefficients['Stat Sig'] = coefficients['p_value'] <= 0.05
coefficients
| feature | coefficient | p_value | Stat Sig | |
|---|---|---|---|---|
| imputer[T.SimpleImputer(strategy='median')] | imputer[T.SimpleImputer(strategy='median')] | -0.346063 | 2.295481e-01 | False |
| imputer[T.SimpleImputer(strategy='most_frequent')] | imputer[T.SimpleImputer(strategy='most_frequen... | -0.035841 | 8.428244e-01 | False |
| scaler[T.StandardScaler()] | scaler[T.StandardScaler()] | 0.206899 | 2.270944e-01 | False |
| encoder[T.OneHotEncoder()] | encoder[T.OneHotEncoder()] | 1.771329 | 2.910803e-11 | True |
| C | C | -0.108623 | 2.024019e-01 | False |
# Echo the primary score column name used in the chart title below.
score_variable
'roc_auc Mean'
# Horizontal bar chart of the standardized coefficients, ordered by absolute
# magnitude; color marks statistical significance.
px.bar(
data_frame=coefficients.reindex(coefficients['coefficient'].abs().sort_values(ascending=True).index),
y='feature',
x='coefficient',
color='Stat Sig',
title=f"Regression Coefficients of Hyper-parameters against '{score_variable}'",
height=600,
width=600*hlp.plot.GOLDEN_RATIO
)
from sklearn.inspection import permutation_importance
# Permutation importance of the full best pipeline on the training data,
# 10 shuffles per feature. (An unused `forest = ...['model']` local — a
# leftover from the sklearn forest example — was removed.)
start_time = time.time()
result = permutation_importance(
    bayes_search.best_estimator_, X_train, y_train, n_repeats=10, random_state=42, n_jobs=2
)
elapsed_time = time.time() - start_time
print(f"Elapsed time to compute the importances: {elapsed_time:.3f} seconds")
feature_names = X_train.columns.to_list()
# Mean importance per feature, sorted for plotting. The `forest_importances`
# name follows the sklearn example; the model here is a LogisticRegression.
forest_importances = pd.Series(result.importances_mean, index=feature_names)
forest_importances = forest_importances.sort_values(ascending=False)
Elapsed time to compute the importances: 3.937 seconds
import matplotlib.pyplot as plt
# Bar chart of permutation importances with std-dev error bars.
fig, ax = plt.subplots()
forest_importances.plot.bar(yerr=result.importances_std, ax=ax)
ax.set_title("Feature importances using permutation on full model")
ax.set_ylabel("Mean accuracy decrease")
fig.set_size_inches(9, 6)
fig.tight_layout()
plt.show()
# Default rate by foreign_worker status: attach the target to a copy of the
# features and take the group mean.
temp = X_train.copy()
temp['default'] = y_train
# 'mean' (string) instead of np.mean: passing numpy functions to agg() is
# deprecated in recent pandas versions.
temp.groupby('foreign_worker').agg({'default': 'mean'})
| default | |
|---|---|
| foreign_worker | |
| yes | 0.308290 |
| no | 0.107143 |
# Box plot: age distribution split by default status, to eyeball whether age
# separates the classes.
fig = px.box(
    data_frame=temp,
    y='age',
    x='default',
    height=600,
    width=600 * hlp.plot.GOLDEN_RATIO,
)
fig.show()
# NOTE: foreign_worker seems like it should be important but is ranked last in feature importance.